{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TASK 2 (2017-2019) - First Difference\n",
    "\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import logging\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from joblib import dump, load\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "import views\n",
    "from views import Ensemble, Model, Downsampling, Period\n",
    "from views.utils.data import assign_into_df\n",
    "from views.apps.transforms import lib as translib\n",
    "from views.apps.evaluation import lib as evallib, feature_importance as fi\n",
    "from views.apps.model import api\n",
    "from views.apps.extras import extras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Level label; it appears in the header and file name of the score table written below.\n",
    "level = \"cm\"\n",
    "\n",
    "# Look up the \"cm_africa_imp_0\" entry of views.DATASETS and keep its frame as df.\n",
    "dataset = views.DATASETS[\"cm_africa_imp_0\"]\n",
    "df = dataset.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output directories of this run, keyed by purpose.\n",
    "model_path = \"./models/eval_real/{sub}\"\n",
    "out_paths = {\n",
    "    \"evaluation2\": model_path.format(sub=\"evaluation2\"),\n",
    "    \"features2\": model_path.format(sub=\"features2\")\n",
    "}\n",
    "# exist_ok=True creates missing directories and leaves existing ones untouched,\n",
    "# which avoids the check-then-create race of a separate os.path.isdir() test.\n",
    "for out_dir in out_paths.values():\n",
    "    os.makedirs(out_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the Wikipedia-derived data and attach it to df.\n",
    "# The CSV location can be overridden through the WIKI_DATA_PATH environment variable;\n",
    "# the fallback is the path the notebook was originally run with.\n",
    "wiki_path = os.environ.get(\n",
    "    \"WIKI_DATA_PATH\",\n",
    "    r'/home/default/Dokumente/TRINITY/comp/prediction-project/data/Wiki_Data_Final_2021.csv',\n",
    ")\n",
    "wiki = pd.read_csv(wiki_path)\n",
    "multi = wiki.set_index(['month_id', 'country_id'])\n",
    "multi = multi.sort_index(level=0)\n",
    "\n",
    "# Left join: every row of df is kept, rows without a match in the CSV get NaN.\n",
    "# NOTE(review): df is overwritten here, so running this cell twice merges the\n",
    "# CSV columns a second time; re-run the dataset cell first to start from a clean df.\n",
    "df = df.merge(multi, on=['month_id', 'country_id'], how='left')\n",
    "# fillna(0) replaces every NaN in df, not only the ones introduced by the merge.\n",
    "df = df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calibration and test partitions, given as month ids (calendar month in the comment).\n",
    "# NOTE(review): month 444 (2016-12) lies in neither the calibration prediction window\n",
    "# nor the test training window - confirm that this gap is intended.\n",
    "calib_window = dict(\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=408,      # 2013-12\n",
    "    predict_start=409,  # 2014-01\n",
    "    predict_end=443,    # 2016-11\n",
    ")\n",
    "test_window = dict(\n",
    "    train_start=121,    # 1990-01\n",
    "    train_end=443,      # 2016-11\n",
    "    predict_start=445,  # 2017-01\n",
    "    predict_end=480,    # 2019-12\n",
    ")\n",
    "\n",
    "period_calib = api.Period(name=\"calib\", **calib_window)\n",
    "period_test = api.Period(name=\"test\", **test_window)\n",
    "\n",
    "periods = [period_calib, period_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The steps to train, predict and evaluate for.\n",
    "steps = [2,3,4,5,6,7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset of \"most important\" features from the ViEWS dataset plus the Wikipedia variables.\n",
    "# Kept as a plain list of column names: indexing df with the list at this point would\n",
    "# copy all of these columns into a second DataFrame just to carry their names.\n",
    "cols_features = [\n",
    "    'ged_best_sb', 'wdi_vc_btl_deth', 'reign_precip', 'reign_prev_conflict', 'reign_tenure_months',\n",
    "    'reign_irregular', 'reign_lastelection', 'reign_loss', 'reign_pctile_risk', 'reign_couprisk',\n",
    "    'ged_count_sb', 'ged_best_os', 'ged_count_os', 'wdi_eg_use_pcap_kg_oe', 'ged_best_ns',\n",
    "    'vdem_v2x_accountability', 'wdi_nv_srv_totl_zs', 'wdi_sp_pop_totl', 'wdi_sl_tlf_totl_fe_zs',\n",
    "    'wdi_sm_pop_totl_zs', 'wdi_sm_pop_refg_or', 'wdi_dt_oda_odat_pc_zs', 'ged_count_ns',\n",
    "    'reign_gov_dominant_party', 'reign_age', 'vdem_v2xpe_exlpol', 'wdi_ag_lnd_frst_k2',\n",
    "    'wdi_bg_gsr_nfsv_gd_zs', 'wdi_st_int_rcpt_xp_zs', 'vdem_v2x_clpol', 'vdem_v2x_genpp',\n",
    "    'diffEN', 'diffFR',\n",
    "]\n",
    "\n",
    "# Fail early, before any model is built, if a feature column is absent from df.\n",
    "missing_features = [col for col in cols_features if col not in df.columns]\n",
    "if missing_features:\n",
    "    raise KeyError(f\"Feature columns missing from df: {missing_features}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify number of estimators in Random Forest\n",
    "n_estimators = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the Task 2 model: outcome, features, steps, periods and estimator.\n",
    "# Training itself is triggered by fit_estimators() in a later cell.\n",
    "\n",
    "task2_delta = api.Model(\n",
    "    name=\"task2_delta\",\n",
    "    col_outcome=\"ln_ged_best_sb\",\n",
    "    cols_features=cols_features,\n",
    "    steps=steps,\n",
    "    outcome_type=\"real\",\n",
    "    periods=periods,\n",
    "    estimator=RandomForestRegressor(\n",
    "        n_estimators=n_estimators,\n",
    "        # No explicit criterion: the default is mean squared error in every\n",
    "        # scikit-learn release, whereas the spelling \"mse\" was removed in 1.2.\n",
    "        n_jobs=-1,\n",
    "        # Fixed seed so that a re-run grows the same forest.\n",
    "        random_state=42,\n",
    "    ),\n",
    "    # Presumably models the change in the outcome (\"First Difference\" in the\n",
    "    # notebook title) - confirm against the views API.\n",
    "    delta_outcome=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models that the following cells fit, predict with and evaluate.\n",
    "models = [task2_delta]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15min 31s, sys: 15.6 s, total: 15min 46s\n",
      "Wall time: 15min 47s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Fit the estimators of every entry in `models` on df (timed by the cell magic above).\n",
    "for model in models:\n",
    "    model.fit_estimators(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the predictions and the calibrated predictions of every model into df.\n",
    "for model in models:\n",
    "    # The plain predictions are assigned first, so they are already part of the\n",
    "    # df that is handed to predict_calibrated().\n",
    "    df = assign_into_df(df, model.predict(df))\n",
    "    df_predictions = model.predict_calibrated(\n",
    "        df=df, period_calib=period_calib, period_test=period_test\n",
    "    )\n",
    "    df = assign_into_df(df, df_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score every model against df; the results are kept on the model object\n",
    "# and read back through model.scores when the score table is built.\n",
    "for model in models:\n",
    "    model.evaluate(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote scores table to ./models/eval_real/evaluation2/task2_delta_cm_uncalibrated_scores.tex.\n"
     ]
    }
   ],
   "source": [
    "# Build one LaTeX score table per model from the scores of the test partition.\n",
    "\n",
    "partition = \"test\"\n",
    "\n",
    "for model in models:\n",
    "    for calib in [\"uncalibrated\"]:\n",
    "        # Score dict of each step for this calibration variant; the \"sc\" entry is skipped.\n",
    "        per_step = {\n",
    "            step: step_scores[calib]\n",
    "            for step, step_scores in model.scores[partition].items()\n",
    "            if step != \"sc\"\n",
    "        }\n",
    "        scores = {\n",
    "            \"Step\": list(per_step),\n",
    "            \"MSE\": [entry[\"mse\"] for entry in per_step.values()],\n",
    "        }\n",
    "        # TADDA is only reported for models with a delta outcome.\n",
    "        if model.delta_outcome:\n",
    "            scores[\"TADDA\"] = [entry[\"tadda_score\"] for entry in per_step.values()]\n",
    "\n",
    "        out = pd.DataFrame(scores)\n",
    "\n",
    "        # Header of %-prefixed lines describing the table, placed in front of it.\n",
    "        now = datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")\n",
    "        meta = f\"\"\"\n",
    "        %Output created by wb_models.ipynb.\n",
    "        %Evaluation of {model.col_outcome} per step.\n",
    "        %Run on selected {model.name} features at {level} level.\n",
    "        %Produced on {now}, written to {out_paths[\"evaluation2\"]}.\n",
    "        \\\\\n",
    "        \"\"\"\n",
    "        tex = meta + out.to_latex(index=False)\n",
    "\n",
    "        path_out = os.path.join(out_paths[\"evaluation2\"], f\"{model.name}_{level}_{calib}_scores.tex\")\n",
    "        with open(path_out, \"w\") as handle:\n",
    "            handle.write(tex)\n",
    "        print(f\"Wrote scores table to {path_out}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Step       MSE     TADDA\n",
      "0     2  0.729436  0.558030\n",
      "1     3  0.733907  0.551119\n",
      "2     4  0.745023  0.545737\n",
      "3     5  0.825394  0.588279\n",
      "4     6  0.806698  0.588165\n",
      "5     7  0.814380  0.583513\n"
     ]
    }
   ],
   "source": [
    "# Show the score table left in `out` by the previous cell (the last one it built).\n",
    "print(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
